# coding=utf-8

import html
import xml.etree.ElementTree as ET

from red import *
from worker_manage import worker, dataparser

from fetcher import *
from datadefine import *

# Empty export list: a star-import of this module brings in no names.
__all__ = ()

def de_html_char(text):
    '''Strip HTML markup from text and tidy up its whitespace.

    Steps, in order: remove anything matching <...>, decode HTML entities,
    turn ideographic spaces and newlines into plain spaces, shrink runs of
    three or more whitespace characters to two spaces, and trim both ends.
    (red.sub is assumed to behave like re.sub -- confirm in the red module.)
    '''

    # Tags go first, so that entities decoded afterwards (e.g. &lt;) are
    # not mistaken for markup.
    without_tags = red.sub(r'''<.*?>''', r'', text)
    decoded = html.unescape(without_tags)

    # U+3000 ideographic space (UTF-8 bytes e3 80 80) and newlines both
    # become an ordinary space.
    flattened = decoded.replace('　', ' ').replace('\n', ' ')

    # Collapse long whitespace runs, then trim.
    return red.sub(r'\s{3,}', r'  ', flattened).strip()

# Apply one map rule to a regex match
def map_attrs(m, one):
    '''Build a field value from match m according to map rule one.

    An int rule selects that group of the match, cleaned by de_html_char.
    A tuple or list rule is assembled piece by piece: int pieces are group
    numbers (cleaned the same way), str pieces are inserted literally, and
    pieces of any other type are skipped.  Any other rule type prints a
    diagnostic and yields None.
    '''
    rule_type = type(one)

    if rule_type is int:
        return de_html_char(m.group(one))

    if rule_type not in (tuple, list):
        print('map_rule的定义出现错误')
        return None

    pieces = []
    for part in one:
        part_type = type(part)
        if part_type is int:
            pieces.append(de_html_char(m.group(part)))
        elif part_type is str:
            pieces.append(part)
    return ''.join(pieces)

def parse_html(data_dict, html):
    '''Extract c_info records from a page using the configured regex blocks.

    data_dict['blocks_list'] is a sequence of entries indexed as:
      [0] None, or a (pattern, flags) pair; the pattern must match the page
          exactly once and its group(1) becomes the region to search;
      [1] a (pattern, flags) pair matched repeatedly inside that region,
          one match per record;
      [2] a dict mapping a c_info field name (title, url, summary, author,
          pub_date, suid) to a map rule understood by map_attrs.

    Returns a list of c_info, in block order and then match order.  A record
    whose suid is still falsy after all rules ran gets its url as suid.

    Raises c_worker_exception when html is empty, when a block pattern does
    not match exactly once, when an item pattern matches nothing, or when
    applying a map rule fails.

    Note: the html parameter shadows the html module inside this function.
    '''
    if not html:
        raise c_worker_exception('html为空字符串', data_dict['url'], '')

    # c_info attributes that a map rule is allowed to fill in
    known_fields = ('title', 'url', 'summary', 'author', 'pub_date', 'suid')

    re_lst = data_dict['blocks_list']
    ret = list()

    for i, block in enumerate(re_lst):

        # block re: optionally narrow the page down to a single region
        if block[0] is None:
            subhtml = html
        else:
            block_prog = red.d(block[0][0], block[0][1])
            matches = list(block_prog.finditer(html))
            if len(matches) != 1:
                s = '第%d个block_re找到的结果为%d' % (i, len(matches))
                raise c_worker_exception(s, '', '应为1')
            subhtml = matches[0].group(1)

        # item re: every match inside the region is one record
        item_prog = red.d(block[1][0], block[1][1])
        matches = list(item_prog.finditer(subhtml))
        if not matches:
            s = '第%d个item_re找到的结果为0' % i
            raise c_worker_exception(s, '', '应大于0')

        for m in matches:
            info = c_info()

            for k, v in block[2].items():
                try:
                    ss = map_attrs(m, v)
                except Exception as e:
                    s = '处理第%d个map_rule时异常' % i
                    # chain the cause so the original traceback is kept
                    raise c_worker_exception(s, '', str(e)) from e

                if k in known_fields:
                    setattr(info, k, ss)
                else:
                    print('无法处理map_rule', k, v)

            # Applied once per record, after every rule has run, so the
            # fallback sees the final url regardless of rule order.
            if not info.suid:
                info.suid = info.url

            ret.append(info)

    return ret


# download and parse
@worker('html_re')
def download_process(data_dict, worker_dict):
    '''Fetch the page named by data_dict['url'] and parse it with parse_html.

    data_dict['encoding'] is passed to the fetcher when present, otherwise
    an empty string is used.  worker_dict is not used here.
    '''
    target = data_dict['url']
    charset = data_dict.get('encoding', '')

    page = Fetcher().fetch_html(target, charset)
    return parse_html(data_dict, page)


def process_multiline(string):
    '''Join the lines of string into one line, trimming each line first.

    Leading and trailing whitespace of the whole text and of every
    individual line is removed; the lines are concatenated with nothing
    in between.
    '''
    trimmed_lines = (row.strip() for row in string.strip().split('\n'))
    return ''.join(trimmed_lines)

def process_flags(string):
    '''Turn a whitespace-separated list of flag names into a bitmask.

    Names are compared after upper-casing and may be given in long or
    one-letter form (DEBUG has a long form only).  Unrecognised names are
    ignored.  The result is the bitwise OR of the matching attributes of
    red, or 0 when nothing matched.
    '''
    # accepted spelling (upper-cased) -> attribute name on red
    aliases = {
        'ASCII': 'ASCII', 'A': 'ASCII',
        'DEBUG': 'DEBUG',
        'IGNORECASE': 'IGNORECASE', 'I': 'IGNORECASE',
        'LOCALE': 'LOCALE', 'L': 'LOCALE',
        'MULTILINE': 'MULTILINE', 'M': 'MULTILINE',
        'DOTALL': 'DOTALL', 'S': 'DOTALL',
        'VERBOSE': 'VERBOSE', 'X': 'VERBOSE',
    }

    mask = 0
    for token in string.strip().split():
        attr = aliases.get(token.upper())
        if attr is not None:
            # looked up lazily: red is only touched for recognised names
            mask |= getattr(red, attr)

    return mask

def _read_regex(tag):
    '''Return (pattern, flags) from a regex element's text and flags attribute.'''
    return (process_multiline(tag.text),
            process_flags(tag.attrib.get('flags', '')))


@dataparser('html_re')
def html_re_parser(xml_string):
    '''Parse the XML configuration of an html_re source into a dict.

    Expected layout below the root element:
      data/url          -- text is the page address; optional attribute
                           "encoding"
      data/block        -- any number, each containing
        blockre         -- optional regex (attribute "flags") that selects
                           the region of the page to search
        itemre          -- regex (attribute "flags") matching one record
        maprules        -- descendants whose tag is a field name and whose
                           text is a map rule expression

    Regex text may span several lines; it is joined by process_multiline.

    Returns a dict that may contain 'url' and 'encoding' (when a url element
    exists) and 'blocks_list' (when at least one block exists), a list of
    (block_spec, item_spec, map_dict) tuples.  block_spec is None when
    blockre is missing or blank, which parse_html treats as "search the
    whole page"; otherwise both specs are (pattern, flags) pairs.
    '''
    d = dict()
    data = ET.fromstring(xml_string).find('data')

    url_tag = data.find('url')
    if url_tag is not None:
        d['url'] = url_tag.text.strip()
        d['encoding'] = url_tag.attrib.get('encoding', '').strip()

    blocks = data.findall('block')
    if blocks:
        block_list = list()

        for block in blocks:
            # find() returns None for a missing element and .text is None
            # for an empty one; both mean "no block regex".
            blockre = block.find('blockre')
            if blockre is None or not (blockre.text or '').strip():
                block_spec = None
            else:
                block_spec = _read_regex(blockre)

            item_spec = _read_regex(block.find('itemre'))

            map_dict = dict()
            maprules = block.find('maprules')
            for r in maprules.iter():
                # iter() also yields the maprules element itself
                if r.tag != 'maprules':
                    # NOTE(security): eval runs arbitrary Python taken from
                    # the XML, so the configuration must come from a trusted
                    # source.  If rules are only int/str/tuple literals,
                    # ast.literal_eval would be a safe replacement.
                    # The parentheses let "1, 'x', 2" be written bare.
                    value = '(' + r.text.strip() + ')'
                    map_dict[r.tag] = eval(value)

            block_list.append((block_spec, item_spec, map_dict))

        d['blocks_list'] = block_list

    return d